ggplotly(), plot_ly() and plot_geo()
We will work with two Starbucks datasets: one with the store locations (global) and one with the nutritional data for their food and drink items. We will also do some text analysis of the menu items.
# Load the four input tables used throughout the lab: store locations, menu
# nutrition, state populations, and the state name/abbreviation lookup.
sb_locs <- read_csv(file = "starbucks-locations.csv", show_col_types = FALSE)
sb_nutr <- read_csv(
  file = "starbucks-menu-nutrition.csv",
  show_col_types = FALSE
)
usa_pop <- read_csv(file = "us_state_pop.csv", show_col_types = FALSE)
usa_states <- read_csv(file = "states.csv", show_col_types = FALSE)
# no need to do any sort of cleaning (e.g. remove any rows),
# but check that all data are imported correctly
dim(sb_locs)
## [1] 25600 13
colnames(sb_locs)
## [1] "Brand" "Store Number" "Store Name" "Ownership Type"
## [5] "Street Address" "City" "State/Province" "Country"
## [9] "Postcode" "Phone Number" "Timezone" "Longitude"
## [13] "Latitude"
summary(sb_locs)
## Brand Store Number Store Name Ownership Type
## Length:25600 Length:25600 Length:25600 Length:25600
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Street Address City State/Province Country
## Length:25600 Length:25600 Length:25600 Length:25600
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Postcode Phone Number Timezone Longitude
## Length:25600 Length:25600 Length:25600 Min. :-159.46
## Class :character Class :character Class :character 1st Qu.:-104.67
## Mode :character Mode :character Mode :character Median : -79.35
## Mean : -27.87
## 3rd Qu.: 100.63
## Max. : 176.92
## NA's :1
## Latitude
## Min. :-46.41
## 1st Qu.: 31.24
## Median : 36.75
## Mean : 34.79
## 3rd Qu.: 41.57
## Max. : 64.85
## NA's :1
head(sb_locs)
## # A tibble: 6 × 13
## Brand `Store Number` `Store Name` `Ownership Type` `Street Address` City
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Starbucks 47370-257954 Meritxell, 96 Licensed Av. Meritxell, … Ando…
## 2 Starbucks 22331-212325 Ajman Drive … Licensed 1 Street 69, Al… Ajman
## 3 Starbucks 47089-256771 Dana Mall Licensed Sheikh Khalifa … Ajman
## 4 Starbucks 22126-218024 Twofour 54 Licensed Al Salam Street Abu …
## 5 Starbucks 17127-178586 Al Ain Tower Licensed Khaldiya Area, … Abu …
## 6 Starbucks 17688-182164 Dalma Mall, … Licensed Dalma Mall, Mus… Abu …
## # ℹ 7 more variables: `State/Province` <chr>, Country <chr>, Postcode <chr>,
## # `Phone Number` <chr>, Timezone <chr>, Longitude <dbl>, Latitude <dbl>
dim(sb_nutr)
## [1] 205 7
colnames(sb_nutr)
## [1] "Item" "Category" "Calories" "Fat (g)" "Carb. (g)"
## [6] "Fiber (g)" "Protein (g)"
summary(sb_nutr)
## Item Category Calories Fat (g)
## Length:205 Length:205 Min. : 0.0 Min. : 0.00
## Class :character Class :character 1st Qu.:130.0 1st Qu.: 0.00
## Mode :character Mode :character Median :250.0 Median : 7.00
## Mean :257.2 Mean :10.06
## 3rd Qu.:380.0 3rd Qu.:18.00
## Max. :650.0 Max. :37.00
## Carb. (g) Fiber (g) Protein (g)
## Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.:21.00 1st Qu.: 0.000 1st Qu.: 1.000
## Median :35.00 Median : 1.000 Median : 6.000
## Mean :33.97 Mean : 1.771 Mean : 8.185
## 3rd Qu.:45.00 3rd Qu.: 3.000 3rd Qu.:13.000
## Max. :80.00 Max. :21.000 Max. :34.000
head(sb_nutr)
## # A tibble: 6 × 7
## Item Category Calories `Fat (g)` `Carb. (g)` `Fiber (g)` `Protein (g)`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Chonga Bagel Food 300 5 50 3 12
## 2 8-Grain Roll Food 380 6 70 7 10
## 3 Almond Croi… Food 410 22 45 3 10
## 4 Apple Fritt… Food 460 23 56 2 7
## 5 Banana Nut … Food 420 22 52 2 6
## 6 Blueberry M… Food 380 16 53 1 6
dim(usa_pop)
## [1] 55 2
colnames(usa_pop)
## [1] "state" "population"
summary(usa_pop)
## state population
## Length:55 Min. : 56882
## Class :character 1st Qu.: 1344331
## Mode :character Median : 3751351
## Mean : 5677621
## 3rd Qu.: 6515716
## Max. :37253956
head(usa_pop)
## # A tibble: 6 × 2
## state population
## <chr> <dbl>
## 1 Alabama 4779736
## 2 Alaska 710231
## 3 Arizona 6392017
## 4 Arkansas 2915918
## 5 California 37253956
## 6 Colorado 5029196
dim(usa_states)
## [1] 51 2
colnames(usa_states)
## [1] "State" "Abbreviation"
summary(usa_states)
## State Abbreviation
## Length:51 Length:51
## Class :character Class :character
## Mode :character Mode :character
head(usa_states)
## # A tibble: 6 × 2
## State Abbreviation
## <chr> <chr>
## 1 Alabama AL
## 2 Alaska AK
## 3 Arizona AZ
## 4 Arkansas AR
## 5 California CA
## 6 Colorado CO
# Keep only the store rows whose Country is "US".
sb_usa <- filter(sb_locs, Country == "US")
# Number of US stores per state. `State/Province` is renamed to `state` first
# so the result has the columns `state` and `n_stores`.
sb_locs_state <- sb_usa |>
  rename(state = `State/Province`) |>
  count(state, name = "n_stores")
# Attach the abbreviation from usa_states to each population row, matching
# usa_pop$state against usa_states$State. A full join keeps rows from either
# table that have no partner in the other.
usa_pop_abbr <- usa_pop |>
  full_join(usa_states, by = join_by(state == State))
# Combine per-state store counts with population, matching the store table's
# `state` column against `Abbreviation`. Both inputs carry a `state` column,
# so the one coming from usa_pop_abbr appears as `state.y` in the result.
sb_locs_state <- sb_locs_state |>
  full_join(usa_pop_abbr, by = join_by(state == Abbreviation))
summary(sb_locs_state)
## state n_stores state.y population
## Length:55 Min. : 8.0 Length:55 Min. : 56882
## Class :character 1st Qu.: 56.5 Class :character 1st Qu.: 1344331
## Mode :character Median : 123.0 Mode :character Median : 3751351
## Mean : 266.8 Mean : 5677621
## 3rd Qu.: 332.0 3rd Qu.: 6515716
## Max. :2821.0 Max. :37253956
## NA's :4
ggplotly for EDA
Answer the following questions:
Is the number of Starbucks stores proportional to the population of a state? (scatterplot)
Is the caloric distribution of Starbucks menu items different for drinks and food? (histogram)
What are the top 20 words in Starbucks menu items? (bar plot)
# Scatterplot of store count against population, one point per state,
# rendered interactively through ggplotly().
p1 <- sb_locs_state |>
  ggplot(mapping = aes(x = population, y = n_stores, color = state)) +
  geom_point(alpha = 0.8)
ggplotly(p1)
# Histogram of calories in 50-calorie bins, filled by Category,
# rendered interactively through ggplotly().
p2 <- sb_nutr |>
  ggplot(mapping = aes(x = Calories, fill = Category)) +
  geom_histogram(binwidth = 50, alpha = 0.7)
ggplotly(p2)
# Bar chart of the 20 most frequent words in menu item names.
# Item is split into one word per row, words are counted in descending
# order, and the bars are ordered by count before flipping the axes.
p3 <- sb_nutr |>
  unnest_tokens(word, Item, token = "words") |>
  count(word, sort = TRUE) |> # TRUE, not T: T can be reassigned
  head(20) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip()
ggplotly(p3)
# plot_ly()
# Scatterplot with plot_ly() representing the
# relationship between calories and carbs
sb_nutr |>
  plot_ly(
    x = ~Calories,
    y = ~`Carb. (g)`,
    type = "scatter",
    mode = "markers",
    color = ~Category
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# hovermode = "compare"
# The ten most frequent words across menu item names (columns: word, n).
topwords <- sb_nutr |>
  unnest_tokens(word, Item, token = "words") |>
  count(word, sort = TRUE) |> # TRUE, not T: T can be reassigned
  head(10)
# Calories vs. carbs for rows whose item name contains one of the top words.
# Item is split into one word per row, so the hover label shows the matched
# word, not the full item name.
sb_nutr |>
  unnest_tokens(word, Item, token = "words") |>
  filter(word %in% topwords$word) |>
  plot_ly(
    x = ~Calories,
    y = ~`Carb. (g)`,
    type = "scatter",
    mode = "markers",
    color = ~Category,
    hoverinfo = "text",
    # paste0() has no `sep` argument; the former `sep = ""` was just pasted
    # on as an extra empty string, so dropping it leaves the text unchanged.
    text = ~paste0("Item: ", word)
  ) |>
  layout(
    title = "Calories vs. Carbs",
    xaxis = list(title = "Calories"),
    yaxis = list(title = "Carbs (g)"),
    hovermode = "compare"
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# plot_ly Boxplots
# Menu items split into one word per row, keeping only rows whose word is
# among the top words; the nutrition columns are carried along on each row.
filtered_data <- sb_nutr |>
  unnest_tokens(word, Item, token = "words") |>
  filter(word %in% topwords$word)
# Grouped boxplots of each nutrition measure per top word. The traces are
# added one at a time, each showing all underlying points.
boxplot <- plot_ly(filtered_data, x = ~word, type = "box")
boxplot <- add_boxplot(boxplot, y = ~Calories, boxpoints = "all")
boxplot <- add_boxplot(boxplot, y = ~`Fat (g)`, boxpoints = "all")
boxplot <- add_boxplot(boxplot, y = ~`Carb. (g)`, boxpoints = "all")
boxplot <- add_boxplot(boxplot, y = ~`Fiber (g)`, boxpoints = "all")
boxplot <- add_boxplot(boxplot, y = ~`Protein (g)`, boxpoints = "all")
boxplot <- layout(boxplot, boxmode = "group")
boxplot
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
# 3D scatter of calories, carbs and protein, colored by the matched word.
plot_ly(
  data = filtered_data,
  x = ~Calories,
  y = ~`Carb. (g)`,
  z = ~`Protein (g)`,
  type = "scatter3d",
  mode = "markers",
  color = ~word
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# plot_ly Map
# Set up mapping details: the `geo` layout settings shared by both maps.
set_map_details <- list(
  scope = "usa",
  projection = list(type = "albers usa"),
  showlakes = TRUE,
  lakecolor = toRGB("steelblue")
)
# Make sure both maps are on the same color scale
# NOTE(review): shadeLimit is not referenced by any code visible in this
# section, so the two maps below are not actually tied to a common scale.
# Confirm whether it is used elsewhere or should be applied to the traces.
shadeLimit <- 125
# Build the per-state hover label (store count, state name, population),
# with the three parts separated by HTML line breaks.
sb_locs_state <- sb_locs_state |>
  mutate(
    hover = paste(
      "Number of Starbucks: ", n_stores, "<br>",
      "State: ", state.y, "<br>",
      "Population: ", population
    )
  )
# Choropleth of store counts, located by the `state` column and shaded with
# the "Purples" palette.
map1 <- sb_locs_state |>
  plot_geo(locationmode = "USA-states") |>
  add_trace(
    z = ~n_stores,
    text = ~hover,
    locations = ~state,
    color = ~n_stores,
    colors = "Purples"
  ) |>
  layout(title = "starbucks stores by state", geo = set_map_details)
map1
## Warning: Ignoring 4 observations
# Choropleth of state population with the same geo settings and palette as
# map1, followed by both maps combined in a single subplot.
map2 <- sb_locs_state |>
  plot_geo(locationmode = "USA-states") |>
  add_trace(
    z = ~population,
    text = ~hover,
    locations = ~state,
    color = ~population,
    colors = "Purples"
  ) |>
  layout(title = "starbucks stores by population", geo = set_map_details)
map2
subplot(map1, map2)
## Warning: Ignoring 4 observations